# R Options
options(stringsAsFactors=FALSE)
# Required libraries
library(dplyr)
library(Seurat)
library(ggplot2)
library(openxlsx)
library(reticulate)
library(MAST)
library(biomaRt)
library(enrichR)
library(ggpubr)
library(knitr)
library(kableExtra)
library(dplyr)
library(tidyr)
library(ggsci)
use_python('/usr/bin/python')
# ggplot style
plot.mystyle = function(p, title=NULL, col=NULL, legend.title=NULL, legend.position=NULL) {
p = p + theme_light() + theme(panel.border = element_blank())
if(!is.null(title)) p = p + ggtitle(title) + theme(plot.title = element_text(hjust=0.5))
if(length(col) > 0) p = p + scale_fill_manual(values=col)
if(!is.null(legend.title)) {
p = p + labs(color=legend.title, fill=legend.title)
} else {
p = p + theme(legend.title = element_blank())
}
if(!is.null(legend.position)) p = p + theme(legend.position=legend.position)
return(p)
}Open this code chunk to read all parameters that are set specifically for your project.
param = list()
# Project ID
param$project = "pbmc"
# Project-specific paths
param$path = "~/scrnaseq_refDatasets/10x_pbmc_1k_healthyDonor_v3Chemistry/"
param$path.out = paste0(param$path, "/Seurat/")
if(!file.exists(param$path.out)) dir.create(param$path.out)
# Input data path in case Cell Ranger was run
param$path.data = "~/scrnaseq_refDatasets/10x_pbmc_1k_healthyDonor_v3Chemistry/filtered_feature_bc_matrix/"
param$file.mapping.stats = "~/scrnaseq_refDatasets/10x_pbmc_1k_healthyDonor_v3Chemistry/pbmc_1k_v3_metrics_summary.csv"
param$file.annot = "~/ensembl/hsapiens_gene_ensembl.98.csv"
# In case of HTO, readable names for Hashtag Oligos
param$hto.names = NULL
# Prefix of mitochondrial genes
param$mt = "^MT-"
# Biomart dataset to use for gene name translations
param$mart.dataset = "hsapiens_gene_ensembl"
# The number of PCs to use; adjust this parameter based on JackStraw and Elbowplot
param$pc.n = 7
# Resolution of clusters; low values will lead to fewer clusters of cells
param$cluster.resolution=0.5
# Thresholds to define differentially expressed genes
param$padj = 0.05
param$log2fc = log2(1.5)
# Marker genes based on literature
# https://icb-scanpy-tutorials.readthedocs-hosted.com/en/latest/visualizing-marker-genes.html
param$known.markers = list()
param$known.markers[["bcell"]] = c("CD79A", "MS4A1")
param$known.markers[["tcell"]] = "CD3D"
param$known.markers[["tcell.cd8+"]] = c("CD8A", "CD8B")
param$known.markers[["nk"]] = c("GNLY", "NKG7")
param$known.markers[["myeloid"]] = c("CST3", "LYZ")
param$known.markers[["monocytes"]] = "FCGR3A"
param$known.markers[["dendritic"]] = "FCER1A"
# Enrichr databases of interest
param$enrichr.dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")
# Main color to use for plots
param$col = "palevioletred"We begin by printing mapping statistics that have been produced prior to this workflow.
if(!is.null(param$file.mapping.stats)){
mapping.stats = as.data.frame(t(read.delim(param$file.mapping.stats, sep=",", header=TRUE, check.names=FALSE)))
colnames(mapping.stats) = "Value"
kable(mapping.stats, align="l", caption="Mapping statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}| Value | |
|---|---|
| Estimated Number of Cells | 1,222 |
| Mean Reads per Cell | 54,502 |
| Median Genes per Cell | 1,919 |
| Number of Reads | 66,601,887 |
| Valid Barcodes | 97.4% |
| Sequencing Saturation | 70.8% |
| Q30 Bases in Barcode | 94.1% |
| Q30 Bases in RNA Read | 90.2% |
| Q30 Bases in Sample Index | 91.1% |
| Q30 Bases in UMI | 92.7% |
| Reads Mapped to Genome | 95.4% |
| Reads Mapped Confidently to Genome | 92.4% |
| Reads Mapped Confidently to Intergenic Regions | 4.8% |
| Reads Mapped Confidently to Intronic Regions | 31.1% |
| Reads Mapped Confidently to Exonic Regions | 56.5% |
| Reads Mapped Confidently to Transcriptome | 53.7% |
| Reads Mapped Antisense to Gene | 1.0% |
| Fraction Reads in Cells | 94.9% |
| Total Genes Detected | 18,391 |
| Median UMI Counts per Cell | 6,628 |
We next read the scRNA-seq counts table to initialise a Seurat object.
# Load the dataset
sc.data = Read10X(param$path.data)
# Initialize the Seurat object with the raw (non-normalized data).
sc = CreateSeuratObject(counts=sc.data, project=param$project, min.cells=3)
sc## An object of class Seurat
## 15247 features across 1222 samples within 1 assay
## Active assay: RNA (15247 features)
We finally read additional gene annotation from Ensembl, and translate Ensembl IDs to Entrez gene symbols. The resulting table is written to file.
# Read feature IDs: Ensembl ID, 10X gene symbol
gene.ids = read.delim(paste0(param$path.data, "features.tsv.gz"), header=FALSE)
gene.ids = gene.ids[gene.ids[,3]=="Gene Expression", 1:2]
colnames(gene.ids) = c("Ensembl", "GeneSymbol")
# Seurat does this as well, so we need to replicate it
gene.ids[,"GeneSymbol"] = make.unique(gene.ids[,"GeneSymbol"])
# When you call "CreateSeuratObject", underscores "_" in gene names are replaced with minus "-"
gene.ids = cbind(gene.ids, GeneSymbolEdited=gsub(gene.ids[,"GeneSymbol"], pattern="_", replacement="-", fixed=TRUE))
# Retrieve Entrez Gene Symbols; this is needed for EnrichR functional enrichment analysis
#mart = useMart("ensembl", dataset=param$mart.dataset)
#mapping = getBM(
# filters="ensembl_gene_id",
# attributes=c("ensembl_gene_id", "entrezgene_accession"),
# values=gene.ids[,"Ensembl"],
# mart=mart)
# Note that we get the first Entrez Symbol that matches the Ensembl ID
#mapping.match = match(gene.ids[,"Ensembl"], mapping[,"ensembl_gene_id"])
#gene.ids = cbind(gene.ids, EntrezSymbol=mapping[mapping.match,"entrezgene_accession"])
gene.ids = cbind(gene.ids, EntrezSymbol=gene.ids[,"GeneSymbol"])
# Write table
write.table(gene.ids, file=paste0(param$path.out, "/GeneIds.txt"), sep="\t", row.names=TRUE, col.names=TRUE, quote=FALSE)
# Create translation table
symbol.to.ensembl = setNames(gene.ids[,"Ensembl"], gene.ids[,"GeneSymbolEdited"])
symbol.to.entrez = setNames(gene.ids[,"EntrezSymbol"], gene.ids[,"GeneSymbolEdited"])
# Read Ensembl annotation
annot.ensembl = read.delim(param$file.annot, row.names=1)We start the analysis by removing unwanted cells from the dataset. Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature”), the total number of molecules detected in each cell (“nCount”), and the percentage of reads that map to the mitochrondrial genome (“percent.mt”).
# Calculate percentage of reads that map to mitochondrial genome
sc <- PercentageFeatureSet(sc, pattern=param$mt, col.name="percent.mt")
# Metadata
kable(head(sc@meta.data, 5), align="l", caption="Meta-data, top 5 rows") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")| orig.ident | nCount_RNA | nFeature_RNA | percent.mt | |
|---|---|---|---|---|
| AAACCCAAGGAGAGTA | pbmc | 8286 | 2618 | 10.777215 |
| AAACGCTTCAGCCCAG | pbmc | 5509 | 1805 | 7.968778 |
| AAAGAACAGACGACTG | pbmc | 4280 | 1559 | 6.191589 |
| AAAGAACCAATGGCAG | pbmc | 2754 | 1225 | 5.991285 |
| AAAGAACGTCTGCAAT | pbmc | 6589 | 1828 | 6.617089 |
# Filter cutoffs
features.cut = c(nFeature_RNA=200, nCount_RNA=NA, percent.mt=20)
# Plot QC metrics
features = c("nFeature_RNA", "nCount_RNA", "percent.mt")
p = VlnPlot(sc, features=features, pt.size=0, col=param$col, combine=FALSE)
names(p) = features
for(i in features) {
p[[i]] = plot.mystyle(p[[i]], title=i, legend.position="none") + xlab("")
if(!is.na(features.cut[i])) p[[i]] = p[[i]] + geom_hline(yintercept=features.cut[i], lty=2, col="darkgrey")
}
p = CombinePlots(p, ncol=3)
p = plot.mystyle(p, title="Distribution of feature values")
p # Correlate QC metrics
p = list()
p[[1]] = FeatureScatter(sc, feature1=features[2], feature2=features[1], cols=param$col)
p[[2]] = FeatureScatter(sc, feature1=features[2], feature2=features[3], cols=param$col)
for(i in 1:length(p)) p[[i]] = plot.mystyle(p[[i]], legend.position="none")
p = CombinePlots(p)
p = plot.mystyle(p, title="Features plotted against each other")
p# Actual filtering
sc = subset(sc, subset=nFeature_RNA>features.cut["nFeature_RNA"] & percent.mt<features.cut["percent.mt"])
sc## An object of class Seurat
## 15247 features across 1113 samples within 1 assay
## Active assay: RNA (15247 features)
Feature selection: For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.
Scaling: To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.
# Normalise data the original way
sc = NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 10000)
# Select features from normalised data
sc = FindVariableFeatures(sc, selection.method = "vst", nfeatures = 2000)
# Scale normalised data
all.genes = rownames(sc)
sc = ScaleData(sc, features=all.genes)# Run sctransform
# This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'.
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# normalised data end up here: sc@assays$SCT@data
sc = SCTransform(sc)# Show variable genes
top10 = head(VariableFeatures(sc), 10)
# Plot variable features with and without labels
p1 = VariableFeaturePlot(sc, cols=c("grey", param$col))
p1 = plot.mystyle(p1)
p2 = LabelPoints(plot=p1, points=top10, repel=TRUE, cols=c("grey", param$col))
p = CombinePlots(plots=list(p1, p2), legend="bottom")
p = plot.mystyle(p, title="Variable genes without (left) and with (right) labels")
pA single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. The biological manifold however can be described by far fewer dimensions than the number of genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.
We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.
To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.
sc = RunPCA(sc, features=VariableFeatures(object=sc))
p = VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE)
for(i in 1:length(p)) p[[i]] = plot.mystyle(p[[i]])
p = CombinePlots(p)
p = plot.mystyle(p, title="Top gene loadings of the first two PCs")
pif(run.hto) {
p = DimPlot(sc, reduction="pca", cols=param$col.hto.collapsed)
} else {
p = DimPlot(sc, reduction="pca")
}
p = plot.mystyle(p, title="Cells arranged by the first two PCs", legend.title="HTO")
pWe next need to decide how many PCs we want to use for downstream analyses. The following two plots are designed to help us make an informed decision.
The first plot is based on the “JackStraw” procedure: parts of the data is repeatedly randomly permuted and PCA is rerun, generating a “null distribution” of feature scores. Significant PCs are those with a strong enrichment of low p-value features.
The second plot is an “Elbow plot”: PCs are ranked based on the percentage of variance they explain.
For your dataset, we decided to go for 7 PCs.
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
sc = JackStraw(sc, num.replicate=100, dims=20)
sc = ScoreJackStraw(sc, dims=1:20)
p = JackStrawPlot(sc, dims=1:20)
p = plot.mystyle(p, title="Jack Straw plot", legend.position="bottom")
pSeurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.
# Note: I changed the seed in ./lib/python3.6/site-packages/leidenalg/functions.py to 11 for reproducibility
# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = FindNeighbors(sc, dims=1:param$pc.n)
# Cluster using the Leiden algorithm
# Paper to Leiden algorithm: https://www.nature.com/articles/s41598-019-41695-z
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
sc = FindClusters(sc, resolution=param$cluster.resolution, algorithm=4)We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster.cells = table(sc@active.ident)
cluster.labels = paste0(levels(sc@active.ident)," (", cluster.cells[levels(sc@active.ident)],")")
p = DimPlot(sc, reduction="umap", label=TRUE) + scale_colour_discrete("Cluster", labels=cluster.labels)
p = plot.mystyle(p, "UMAP, cells coloured by cluster identity", legend.position="bottom")
pDo cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
p = FeaturePlot(sc, features=features, cols=c("lightgrey", param$col), combine=FALSE)
names(p) = features
for(i in features) p[[i]] = plot.mystyle(p[[i]], title=i)
CombinePlots(p, ncol=2)Do cells in individual clusters express provided known marker genes?
g = unique(unlist(param$known.markers))
g = g[length(g):1]
d = DotPlot(sc, features=g, cols=c("lightgrey", param$col))
d + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
for(i in 1:length(param$known.markers)) {
f = FeaturePlot(sc, features=param$known.markers[[i]], cols=c("lightgrey", param$col))
f = ggpubr::annotate_figure(p=f, top = ggpubr::text_grob(label=names(param$known.markers)[i], face='bold'))
print(f)
}We next identify genes that are differentially expressed in one cluster compared to all other clusters. Additional gene annotation is added, and the resulting tables are written to file.
# Find markers for every cluster compared to all remaining cells, report only the positive ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
sc.markers = FindAllMarkers(sc, assay="RNA", only.pos=FALSE, min.pct=0.25, logfc.threshold=0.25, test.use="MAST")
sc.markers.top2 = sc.markers %>% group_by(cluster) %>% top_n(n=2, wt=avg_logFC) %>% as.data.frame
kable(sc.markers.top2, align="l", caption="Top 2 DEGs per cell cluster") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")| p_val | avg_logFC | pct.1 | pct.2 | p_val_adj | cluster | gene |
|---|---|---|---|---|---|---|
| 0 | 3.8292753 | 1.000 | 0.156 | 0 | 1 | S100A8 |
| 0 | 3.4714042 | 1.000 | 0.204 | 0 | 1 | S100A9 |
| 0 | 1.3654774 | 0.938 | 0.255 | 0 | 2 | IL7R |
| 0 | 1.0531971 | 0.964 | 0.341 | 0 | 2 | TRAC |
| 0 | 3.6609018 | 0.871 | 0.025 | 0 | 3 | IGHM |
| 0 | 4.9139410 | 0.796 | 0.237 | 0 | 3 | IGKC |
| 0 | 0.9279829 | 0.759 | 0.154 | 0 | 4 | CCR7 |
| 0 | 0.9427734 | 0.927 | 0.499 | 0 | 4 | NOSIP |
| 0 | 2.0182653 | 0.837 | 0.049 | 0 | 5 | GZMK |
| 0 | 1.8890308 | 0.748 | 0.140 | 0 | 5 | KLRB1 |
| 0 | 1.7451108 | 1.000 | 0.293 | 0 | 6 | CST3 |
| 0 | 1.6522173 | 1.000 | 0.514 | 0 | 6 | HLA-DPA1 |
| 0 | 3.8593508 | 0.980 | 0.073 | 0 | 7 | GNLY |
| 0 | 2.9277180 | 1.000 | 0.187 | 0 | 7 | NKG7 |
| 0 | 5.0496622 | 1.000 | 0.237 | 0 | 8 | NRGN |
| 0 | 5.9002407 | 1.000 | 0.042 | 0 | 8 | PPBP |
# Add Ensembl annotation
sc.markers.ensembl = symbol.to.ensembl[sc.markers[,"gene"]]
sc.markers.annot = cbind(sc.markers, annot.ensembl[sc.markers.ensembl,])
# Output in Excel sheet
sc.markers.lst = lapply(levels(sc.markers.annot$cluster), function(x){sc.markers.annot %>% dplyr::filter(cluster==x)})
names(sc.markers.lst) = paste0("cluster", levels(sc.markers$cluster))
write.xlsx(sc.markers.lst, file=paste0(param$path.out, "Markers.xlsx"))
# Filter markers based on p-value and fold-change
sc.markers.filt = sc.markers %>% dplyr::filter(p_val_adj <= param$padj) %>% dplyr::filter((avg_logFC <= -param$log2fc) | (avg_logFC >= param$log2fc)) %>% as.data.frame
sc.markers.filt.down = sc.markers.filt %>% dplyr::filter(avg_logFC <= -param$log2fc) %>% as.data.frame
sc.markers.filt.up = sc.markers.filt %>% dplyr::filter(avg_logFC >= param$log2fc) %>% as.data.frame
# Number of DEGs per cluster
cluster.all = sort(unique(sc.markers[,"cluster"]))
sc.markers.filt.n = cbind(Cluster=cluster.all,
sc.markers.filt.down %>% dplyr::count(cluster) %>% transmute(Down=n),
sc.markers.filt.up %>% dplyr::count(cluster) %>% transmute(Up=n)) %>%
pivot_longer(cols=c("Down", "Up"), names_to="Direction", values_to="n")
p = ggplot(sc.markers.filt.n, aes(x=Cluster, y=n, fill=Direction)) + geom_bar(stat="identity")
p = plot.mystyle(p, title=paste0("Number of DEGs per cell cluster\n(FC=", 2^param$log2fc, ", adj. p-value=", param$padj, ")"), col=c("steelblue", "darkgoldenrod1"))
pThe following plots are exemplary to how we can visualize differentially expressed genes using the “Seurat” R-package. The selected genes are the top differentially expressed genes for cluster 1 to 3, respectively.
# Get top gene per cluster and plot
genes.example = sc.markers %>% group_by(cluster) %>% top_n(n=1, wt=avg_logFC) %>% pull(gene)
genes.example = genes.example[1:3]
# Shows gene expression on the UMAP
p = FeaturePlot(sc, features=genes.example, cols=c("lightgrey", param$col), combine=FALSE)
names(p) = genes.example
for(i in names(p)) p[[i]] = plot.mystyle(p[[i]], title=i)
p = CombinePlots(p)
p = p + ggtitle("UMAP, cells coloured by normalised gene expression data") + theme(plot.title = element_text(hjust=0.5))
p# Ridge plot of normalised gene expression data
p = RidgePlot(sc, features=genes.example, combine=FALSE)
names(p) = genes.example
for(i in names(p)) p[[i]] = plot.mystyle(p[[i]], title=i, legend.title="Cell classification")
p = CombinePlots(p, legend="bottom", ncol=2)
p = plot.mystyle(p, "Ridge plot of normalised gene expression data")
p# Ridge plot of raw gene expression counts
p = RidgePlot(sc, features=genes.example[1:3], slot="counts", combine=FALSE)
names(p) = genes.example
for(i in names(p)) p[[i]] = plot.mystyle(p[[i]], title=i, legend.title="Cell classification")
p = CombinePlots(p, legend="bottom", ncol=2)
p = plot.mystyle(p, title="Ridge plot of raw gene expression counts")
p# Visualises how feature expression changes across different clusters
p = DotPlot(sc, features=genes.example[3:1], cols=c("lightgrey", param$col))
p = plot.mystyle(p, title="Dot plot of normalised gene expression data")
p# Heatmap of top differentially expressed genes
top = sc.markers %>% group_by(cluster) %>% top_n(n=10, wt=avg_logFC)
DoHeatmap(sc, features=top$gene) + NoLegend()## Warning in DoHeatmap(sc, features = top$gene): The following features were
## omitted as they were not found in the scale.data slot for the SCT assay: NPM1
To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website https://amp.pharm.mssm.edu/Enrichr/. You can choose to test functional enrichment from a wide range of databases:
dbs.all = listEnrichrDbs()
kable(dbs.all, align="l", caption="Enrichr databases") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left") %>%
scroll_box(width = "100%", height = "300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms |
|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 |
| 23726 | 127 | GeneSigDB | http://genesigdb.org/genesigdb/downloadall.jsp | 2139 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 |
| 16725 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_down | http://www.gtexportal.org/ | 2918 |
| 19249 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_up | http://www.gtexportal.org/ | 2918 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 |
# DEGs up and down per cluster
cluster.all = sort(unique(sc.markers[,"cluster"]))
genesets.up = lapply(cluster.all, function(x) {
tmp = sc.markers.filt.up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
unique(na.exclude(symbol.to.entrez[tmp]))
})
genesets.down = lapply(cluster.all, function(x) {
tmp = sc.markers.filt.down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
unique(na.exclude(symbol.to.entrez[tmp]))
})
names(genesets.up) = paste0("DEG_up_cluster_", cluster.all)
names(genesets.down) = paste0("DEG_down_cluster_", cluster.all)
genesets = c(genesets.up, genesets.down)
# Loop through gene lists
enriched = list()
for(i in 1:length(genesets)) {
if(length(genesets[[i]]) >= 3) {
message("Geneset ", names(genesets)[i])
enriched[[i]] = enrichr(genesets[[i]], databases=param$enrichr.dbs)
} else {
message("Geneset ", names(genesets)[i], " has less than 3 genes; skip enrichr")
enriched[[i]] = NULL
}
}
names(enriched) = names(genesets)
# Write enrichment results to file
enriched.write = unlist(enriched, recursive=FALSE)
for(i in 1:length(enriched)) {
if(!is.null(enriched[[i]])) {
write.xlsx(enriched[[i]], file=paste0(param$path.out, "/Functions_", names(enriched)[i], ".xlsx"))
}
}How much do gene expression profiles in your dataset reflect the cell cycle phases the single cells were in? We determine the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. 2016, and human gene names are translated using biomaRt.
# Use biomart to translate human cell cycle genes to the species of interest
mart.human = useMart("ensembl", dataset="hsapiens_gene_ensembl")
mart.myspecies = useMart("ensembl", dataset=param$mart.dataset)
genes.s = getLDS(attributes=c("external_gene_name"), filters="external_gene_name", values=cc.genes.updated.2019$s.genes, mart=mart.human, attributesL=c("external_gene_name"), martL=mart.myspecies, uniqueRows=TRUE)
genes.g2m = getLDS(attributes=c("external_gene_name"), filters="external_gene_name", values=cc.genes.updated.2019$g2m.genes, mart=mart.human, attributesL=c("external_gene_name"), martL=mart.myspecies, uniqueRows=TRUE)
# Determine cell cycle effect
sc = CellCycleScoring(object=sc, s.features=genes.s[,2], g2m.features=genes.g2m[,2], set.ident=FALSE)## Warning: The following features are not present in the object: EXO1, DTL, E2F8,
## CDC45, DSCC1, RAD51, not searching for symbol synonyms
## Warning: The following features are not present in the object: KIF2C, CDK1,
## PIMREG, DLGAP5, TTK, CDC25C, HMMR, NEK2, HJURP, GTSE1, KIF11, CDCA3, CKAP2L,
## CDCA2, CCNB2, UBE2C, CDC20, BUB1, CENPA, ANLN, not searching for symbol synonyms
# Get a feeling for how many cells are affected
p1 = ggplot(sc@meta.data, aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score")
p1 = plot.mystyle(p1)
p2 = ggplot(sc@meta.data %>% group_by(seurat_clusters,Phase) %>% summarise(num_reads=length(Phase)), aes(x=seurat_clusters, y=num_reads, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells")
p2 = plot.mystyle(p2)
p = CombinePlots(plots=list(p1, p2), legend="bottom")
p = plot.mystyle(p, title="Cell cycle phases")
p# UMAP with phases superimposed
p3 = DimPlot(sc, group.by="Phase", pt.size=1)
plot.mystyle(p3, title="UMAP, cells coloured by cell cycle phases", legend.title="Phase")p3 = FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col))
p3 = plot.mystyle(p3, title="UMAP, cells coloured by S phase")
p4 = FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col))
p4 = plot.mystyle(p4, title="UMAP, cells coloured by G2M phase")
CombinePlots(plots=list(p3, p4))We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.
# Export UMAP coordinates
loupe.umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe.umap$Barcode = gsub(pattern="_", replacement="-", x=rownames(loupe.umap), fixed=TRUE)
if(length(grep(pattern="-$", x=loupe.umap$Barcode, perl=TRUE))==0) loupe.umap$Barcode=paste0(loupe.umap$Barcode, "-1")
loupe.umap = loupe.umap[, c("Barcode", "UMAP_1", "UMAP_2")]
colnames(loupe.umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe.umap, file=paste0(param$path.out, "/Seurat2Loupe.Umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export categorical metadata
meta.to.export = c("orig.ident", "seurat_clusters", "Phase")
if(run.hto) meta.to.export = c(meta.to.export, "HTO_maxID", "HTO_classification", "HTO_classification.global", "hash.ID")
loupe.meta = as.data.frame(sc@meta.data[,meta.to.export])
loupe.meta = cbind(Barcode=gsub(pattern="_", replacement="-", rownames(loupe.meta), fixed=TRUE), loupe.meta)
if(length(grep(pattern="-$", x=loupe.meta$Barcode, perl=TRUE))==0) loupe.meta$Barcode=paste0(loupe.meta$Barcode, "-1")
write.table(x=loupe.meta, file=paste0(param$path.out, "/Seurat2Loupe.metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets
loupe.genesets = data.frame(List=paste0("DEG_up_cluster_", sc.markers.filt.up[,"cluster"]),
Name=sc.markers.filt.up[,"gene"],
Ensembl=symbol.to.ensembl[sc.markers.filt.up[,"gene"]])
loupe.genesets = rbind(loupe.genesets,
data.frame(List=paste0("DEG_down_cluster_", sc.markers.filt.down[,"cluster"]),
Name=sc.markers.filt.down[,"gene"],
Ensembl=symbol.to.ensembl[sc.markers.filt.down[,"gene"]]))
genesets.to.export = list(genes.cc.s.phase=genes.s[,2], genes.cc.g2m.phase=genes.g2m[,2])
for(i in names(genesets.to.export)){
tmp.genes = genesets.to.export[[i]]
tmp.genes = tmp.genes[tmp.genes %in% names(symbol.to.ensembl)]
loupe.genesets = rbind(loupe.genesets,
data.frame(List=i,
Name=tmp.genes,
Ensembl=symbol.to.ensembl[tmp.genes]))
}
write.table(loupe.genesets, file=paste0(param$path.out, "/Seurat2Loupe.genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")All files generated with this report are written into the Seurat output folder: